{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise 2: Advanced Analytics NLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:28:12.472305Z",
     "start_time": "2018-12-04T16:27:10.886061Z"
    }
   },
   "outputs": [],
   "source": [
    "!pip install spark-nlp==1.7.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:28:14.748410Z",
     "start_time": "2018-12-04T16:28:14.342555Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "pd.set_option('max_colwidth', 800)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create a spark context that includes a 3rd party jar for NLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:28:35.841869Z",
     "start_time": "2018-12-04T16:28:31.934985Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "            <div>\n",
       "                <p><b>SparkSession - in-memory</b></p>\n",
       "                \n",
       "        <div>\n",
       "            <p><b>SparkContext</b></p>\n",
       "\n",
       "            <p><a href=\"http://192.168.1.19:4041\">Spark UI</a></p>\n",
       "\n",
       "            <dl>\n",
       "              <dt>Version</dt>\n",
       "                <dd><code>v2.4.0</code></dd>\n",
       "              <dt>Master</dt>\n",
       "                <dd><code>local[2]</code></dd>\n",
       "              <dt>AppName</dt>\n",
       "                <dd><code>pyspark-shell</code></dd>\n",
       "            </dl>\n",
       "        </div>\n",
       "        \n",
       "            </div>\n",
       "        "
      ],
      "text/plain": [
       "<pyspark.sql.session.SparkSession at 0x111855e48>"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#jarPath = \"spark-nlp-assembly-1.7.3.jar\"\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder \\\n",
    "    .config(\"spark.jars.packages\", \"JohnSnowLabs:spark-nlp:1.8.2\") \\\n",
    "    .getOrCreate()\n",
    "spark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read multiple files in a dir as one Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:28:40.343540Z",
     "start_time": "2018-12-04T16:28:35.844308Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100\n",
      "root\n",
      " |-- data: struct (nullable = true)\n",
      " |    |-- approved_at_utc: string (nullable = true)\n",
      " |    |-- approved_by: string (nullable = true)\n",
      " |    |-- archived: boolean (nullable = true)\n",
      " |    |-- author: string (nullable = true)\n",
      " |    |-- author_flair_background_color: string (nullable = true)\n",
      " |    |-- author_flair_css_class: string (nullable = true)\n",
      " |    |-- author_flair_richtext: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- author_flair_template_id: string (nullable = true)\n",
      " |    |-- author_flair_text: string (nullable = true)\n",
      " |    |-- author_flair_text_color: string (nullable = true)\n",
      " |    |-- author_flair_type: string (nullable = true)\n",
      " |    |-- author_fullname: string (nullable = true)\n",
      " |    |-- author_patreon_flair: boolean (nullable = true)\n",
      " |    |-- banned_at_utc: string (nullable = true)\n",
      " |    |-- banned_by: string (nullable = true)\n",
      " |    |-- can_gild: boolean (nullable = true)\n",
      " |    |-- can_mod_post: boolean (nullable = true)\n",
      " |    |-- category: string (nullable = true)\n",
      " |    |-- clicked: boolean (nullable = true)\n",
      " |    |-- content_categories: string (nullable = true)\n",
      " |    |-- contest_mode: boolean (nullable = true)\n",
      " |    |-- created: double (nullable = true)\n",
      " |    |-- created_utc: double (nullable = true)\n",
      " |    |-- crosspost_parent: string (nullable = true)\n",
      " |    |-- crosspost_parent_list: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- approved_at_utc: string (nullable = true)\n",
      " |    |    |    |-- approved_by: string (nullable = true)\n",
      " |    |    |    |-- archived: boolean (nullable = true)\n",
      " |    |    |    |-- author: string (nullable = true)\n",
      " |    |    |    |-- author_flair_background_color: string (nullable = true)\n",
      " |    |    |    |-- author_flair_css_class: string (nullable = true)\n",
      " |    |    |    |-- author_flair_richtext: array (nullable = true)\n",
      " |    |    |    |    |-- element: string (containsNull = true)\n",
      " |    |    |    |-- author_flair_template_id: string (nullable = true)\n",
      " |    |    |    |-- author_flair_text: string (nullable = true)\n",
      " |    |    |    |-- author_flair_text_color: string (nullable = true)\n",
      " |    |    |    |-- author_flair_type: string (nullable = true)\n",
      " |    |    |    |-- author_fullname: string (nullable = true)\n",
      " |    |    |    |-- author_patreon_flair: boolean (nullable = true)\n",
      " |    |    |    |-- banned_at_utc: string (nullable = true)\n",
      " |    |    |    |-- banned_by: string (nullable = true)\n",
      " |    |    |    |-- can_gild: boolean (nullable = true)\n",
      " |    |    |    |-- can_mod_post: boolean (nullable = true)\n",
      " |    |    |    |-- category: string (nullable = true)\n",
      " |    |    |    |-- clicked: boolean (nullable = true)\n",
      " |    |    |    |-- content_categories: string (nullable = true)\n",
      " |    |    |    |-- contest_mode: boolean (nullable = true)\n",
      " |    |    |    |-- created: double (nullable = true)\n",
      " |    |    |    |-- created_utc: double (nullable = true)\n",
      " |    |    |    |-- distinguished: string (nullable = true)\n",
      " |    |    |    |-- domain: string (nullable = true)\n",
      " |    |    |    |-- downs: long (nullable = true)\n",
      " |    |    |    |-- edited: boolean (nullable = true)\n",
      " |    |    |    |-- gilded: long (nullable = true)\n",
      " |    |    |    |-- gildings: struct (nullable = true)\n",
      " |    |    |    |    |-- gid_1: long (nullable = true)\n",
      " |    |    |    |    |-- gid_2: long (nullable = true)\n",
      " |    |    |    |    |-- gid_3: long (nullable = true)\n",
      " |    |    |    |-- hidden: boolean (nullable = true)\n",
      " |    |    |    |-- hide_score: boolean (nullable = true)\n",
      " |    |    |    |-- id: string (nullable = true)\n",
      " |    |    |    |-- is_crosspostable: boolean (nullable = true)\n",
      " |    |    |    |-- is_meta: boolean (nullable = true)\n",
      " |    |    |    |-- is_original_content: boolean (nullable = true)\n",
      " |    |    |    |-- is_reddit_media_domain: boolean (nullable = true)\n",
      " |    |    |    |-- is_robot_indexable: boolean (nullable = true)\n",
      " |    |    |    |-- is_self: boolean (nullable = true)\n",
      " |    |    |    |-- is_video: boolean (nullable = true)\n",
      " |    |    |    |-- likes: string (nullable = true)\n",
      " |    |    |    |-- link_flair_background_color: string (nullable = true)\n",
      " |    |    |    |-- link_flair_css_class: string (nullable = true)\n",
      " |    |    |    |-- link_flair_richtext: array (nullable = true)\n",
      " |    |    |    |    |-- element: string (containsNull = true)\n",
      " |    |    |    |-- link_flair_template_id: string (nullable = true)\n",
      " |    |    |    |-- link_flair_text: string (nullable = true)\n",
      " |    |    |    |-- link_flair_text_color: string (nullable = true)\n",
      " |    |    |    |-- link_flair_type: string (nullable = true)\n",
      " |    |    |    |-- locked: boolean (nullable = true)\n",
      " |    |    |    |-- media: string (nullable = true)\n",
      " |    |    |    |-- media_only: boolean (nullable = true)\n",
      " |    |    |    |-- mod_note: string (nullable = true)\n",
      " |    |    |    |-- mod_reason_by: string (nullable = true)\n",
      " |    |    |    |-- mod_reason_title: string (nullable = true)\n",
      " |    |    |    |-- mod_reports: array (nullable = true)\n",
      " |    |    |    |    |-- element: string (containsNull = true)\n",
      " |    |    |    |-- name: string (nullable = true)\n",
      " |    |    |    |-- no_follow: boolean (nullable = true)\n",
      " |    |    |    |-- num_comments: long (nullable = true)\n",
      " |    |    |    |-- num_crossposts: long (nullable = true)\n",
      " |    |    |    |-- num_reports: string (nullable = true)\n",
      " |    |    |    |-- over_18: boolean (nullable = true)\n",
      " |    |    |    |-- parent_whitelist_status: string (nullable = true)\n",
      " |    |    |    |-- permalink: string (nullable = true)\n",
      " |    |    |    |-- pinned: boolean (nullable = true)\n",
      " |    |    |    |-- pwls: long (nullable = true)\n",
      " |    |    |    |-- quarantine: boolean (nullable = true)\n",
      " |    |    |    |-- removal_reason: string (nullable = true)\n",
      " |    |    |    |-- report_reasons: string (nullable = true)\n",
      " |    |    |    |-- saved: boolean (nullable = true)\n",
      " |    |    |    |-- score: long (nullable = true)\n",
      " |    |    |    |-- secure_media: string (nullable = true)\n",
      " |    |    |    |-- selftext: string (nullable = true)\n",
      " |    |    |    |-- selftext_html: string (nullable = true)\n",
      " |    |    |    |-- send_replies: boolean (nullable = true)\n",
      " |    |    |    |-- spoiler: boolean (nullable = true)\n",
      " |    |    |    |-- stickied: boolean (nullable = true)\n",
      " |    |    |    |-- subreddit: string (nullable = true)\n",
      " |    |    |    |-- subreddit_id: string (nullable = true)\n",
      " |    |    |    |-- subreddit_name_prefixed: string (nullable = true)\n",
      " |    |    |    |-- subreddit_subscribers: long (nullable = true)\n",
      " |    |    |    |-- subreddit_type: string (nullable = true)\n",
      " |    |    |    |-- suggested_sort: string (nullable = true)\n",
      " |    |    |    |-- thumbnail: string (nullable = true)\n",
      " |    |    |    |-- title: string (nullable = true)\n",
      " |    |    |    |-- ups: long (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |    |    |    |-- user_reports: array (nullable = true)\n",
      " |    |    |    |    |-- element: string (containsNull = true)\n",
      " |    |    |    |-- view_count: string (nullable = true)\n",
      " |    |    |    |-- visited: boolean (nullable = true)\n",
      " |    |    |    |-- whitelist_status: string (nullable = true)\n",
      " |    |    |    |-- wls: long (nullable = true)\n",
      " |    |-- distinguished: string (nullable = true)\n",
      " |    |-- domain: string (nullable = true)\n",
      " |    |-- downs: long (nullable = true)\n",
      " |    |-- edited: boolean (nullable = true)\n",
      " |    |-- gilded: long (nullable = true)\n",
      " |    |-- gildings: struct (nullable = true)\n",
      " |    |    |-- gid_1: long (nullable = true)\n",
      " |    |    |-- gid_2: long (nullable = true)\n",
      " |    |    |-- gid_3: long (nullable = true)\n",
      " |    |-- hidden: boolean (nullable = true)\n",
      " |    |-- hide_score: boolean (nullable = true)\n",
      " |    |-- id: string (nullable = true)\n",
      " |    |-- is_crosspostable: boolean (nullable = true)\n",
      " |    |-- is_meta: boolean (nullable = true)\n",
      " |    |-- is_original_content: boolean (nullable = true)\n",
      " |    |-- is_reddit_media_domain: boolean (nullable = true)\n",
      " |    |-- is_robot_indexable: boolean (nullable = true)\n",
      " |    |-- is_self: boolean (nullable = true)\n",
      " |    |-- is_video: boolean (nullable = true)\n",
      " |    |-- likes: string (nullable = true)\n",
      " |    |-- link_flair_background_color: string (nullable = true)\n",
      " |    |-- link_flair_css_class: string (nullable = true)\n",
      " |    |-- link_flair_richtext: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- link_flair_template_id: string (nullable = true)\n",
      " |    |-- link_flair_text: string (nullable = true)\n",
      " |    |-- link_flair_text_color: string (nullable = true)\n",
      " |    |-- link_flair_type: string (nullable = true)\n",
      " |    |-- locked: boolean (nullable = true)\n",
      " |    |-- media: string (nullable = true)\n",
      " |    |-- media_only: boolean (nullable = true)\n",
      " |    |-- mod_note: string (nullable = true)\n",
      " |    |-- mod_reason_by: string (nullable = true)\n",
      " |    |-- mod_reason_title: string (nullable = true)\n",
      " |    |-- mod_reports: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- name: string (nullable = true)\n",
      " |    |-- no_follow: boolean (nullable = true)\n",
      " |    |-- num_comments: long (nullable = true)\n",
      " |    |-- num_crossposts: long (nullable = true)\n",
      " |    |-- num_reports: string (nullable = true)\n",
      " |    |-- over_18: boolean (nullable = true)\n",
      " |    |-- parent_whitelist_status: string (nullable = true)\n",
      " |    |-- permalink: string (nullable = true)\n",
      " |    |-- pinned: boolean (nullable = true)\n",
      " |    |-- pwls: long (nullable = true)\n",
      " |    |-- quarantine: boolean (nullable = true)\n",
      " |    |-- removal_reason: string (nullable = true)\n",
      " |    |-- report_reasons: string (nullable = true)\n",
      " |    |-- saved: boolean (nullable = true)\n",
      " |    |-- score: long (nullable = true)\n",
      " |    |-- secure_media: string (nullable = true)\n",
      " |    |-- selftext: string (nullable = true)\n",
      " |    |-- selftext_html: string (nullable = true)\n",
      " |    |-- send_replies: boolean (nullable = true)\n",
      " |    |-- spoiler: boolean (nullable = true)\n",
      " |    |-- stickied: boolean (nullable = true)\n",
      " |    |-- subreddit: string (nullable = true)\n",
      " |    |-- subreddit_id: string (nullable = true)\n",
      " |    |-- subreddit_name_prefixed: string (nullable = true)\n",
      " |    |-- subreddit_subscribers: long (nullable = true)\n",
      " |    |-- subreddit_type: string (nullable = true)\n",
      " |    |-- suggested_sort: string (nullable = true)\n",
      " |    |-- thumbnail: string (nullable = true)\n",
      " |    |-- title: string (nullable = true)\n",
      " |    |-- ups: long (nullable = true)\n",
      " |    |-- url: string (nullable = true)\n",
      " |    |-- user_reports: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- view_count: string (nullable = true)\n",
      " |    |-- visited: boolean (nullable = true)\n",
      " |    |-- whitelist_status: string (nullable = true)\n",
      " |    |-- wls: long (nullable = true)\n",
      " |-- kind: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dataPath = \"./data/reddit/*.json\"\n",
    "df = spark.read.json(dataPath)\n",
    "print(df.count())\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deal with Struct type to query subfields "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:28:40.819564Z",
     "start_time": "2018-12-04T16:28:40.345335Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Microsoft Corp said it has discovered hacking targeting democratic institutions, think tanks, and non-profit organizations in Europe.</td>\n",
       "      <td>jaykirsch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Deutsche Bank reportedly planned to extend the dates of $340 million in loans to Trump Organization to avoid a potential nightmare of chasing a sitting president for cash</td>\n",
       "      <td>canuck_burger</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Iranian \"morality police\" were forced to fire warning shots when a crowd intervened to prevent them from arresting two women for not wearing a hijab. The incident occurred in Tehran's northeastern Narmak neighbourhood on Friday night, and ended with a mob tearing the door off a police vehicle.</td>\n",
       "      <td>honolulu_oahu_mod</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Trump administration 'pushing Saudi nuclear deal' which could benefit company linked to Jared Kushner - Senior Trump administration officials pushed a project to share nuclear power technology with Saudi Arabia over the objections of ethics officials, according to a congressional report</td>\n",
       "      <td>madam1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NASA Happily Reports the Earth is Greener, With More Trees Than 20 Years Ago–and It's Thanks to China, India</td>\n",
       "      <td>purplexxx</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                                                                                                                                                    title  \\\n",
       "0                                                                                                                                                                   Microsoft Corp said it has discovered hacking targeting democratic institutions, think tanks, and non-profit organizations in Europe.   \n",
       "1                                                                                                                              Deutsche Bank reportedly planned to extend the dates of $340 million in loans to Trump Organization to avoid a potential nightmare of chasing a sitting president for cash   \n",
       "2  Iranian \"morality police\" were forced to fire warning shots when a crowd intervened to prevent them from arresting two women for not wearing a hijab. The incident occurred in Tehran's northeastern Narmak neighbourhood on Friday night, and ended with a mob tearing the door off a police vehicle.   \n",
       "3         Trump administration 'pushing Saudi nuclear deal' which could benefit company linked to Jared Kushner - Senior Trump administration officials pushed a project to share nuclear power technology with Saudi Arabia over the objections of ethics officials, according to a congressional report   \n",
       "4                                                                                                                                                                                            NASA Happily Reports the Earth is Greener, With More Trees Than 20 Years Ago–and It's Thanks to China, India   \n",
       "\n",
       "              author  \n",
       "0          jaykirsch  \n",
       "1      canuck_burger  \n",
       "2  honolulu_oahu_mod  \n",
       "3             madam1  \n",
       "4          purplexxx  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title = \"data.title\"\n",
    "author = \"data.author\"\n",
    "dfAuthorTilte = df.select(title, author)\n",
    "dfAuthorTilte.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Try to implement the equivalent of flatMap in dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:28:43.068755Z",
     "start_time": "2018-12-04T16:28:40.826537Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>to</td>\n",
       "      <td>58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>the</td>\n",
       "      <td>46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>of</td>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>in</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>for</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>and</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>from</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>on</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>with</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   word  count\n",
       "0    to     58\n",
       "1   the     46\n",
       "2    of     42\n",
       "3    in     41\n",
       "4     a     25\n",
       "5   for     20\n",
       "6   and     19\n",
       "7  from     12\n",
       "8    on     11\n",
       "9  with     10"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pyspark.sql.functions as F\n",
    "\n",
    "dfWordCount = df.select(F.explode(F.split(title,\"\\\\s+\")).alias(\"word\")).groupBy(\"word\").count().orderBy(F.desc(\"count\"))\n",
    "dfWordCount.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use an NLP libary to do Part-of-Speech Tagging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:29:10.967990Z",
     "start_time": "2018-12-04T16:28:43.072151Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- text: string (nullable = true)\n",
      " |-- author: string (nullable = true)\n",
      " |-- document: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- annotatorType: string (nullable = true)\n",
      " |    |    |-- begin: integer (nullable = false)\n",
      " |    |    |-- end: integer (nullable = false)\n",
      " |    |    |-- result: string (nullable = true)\n",
      " |    |    |-- metadata: map (nullable = true)\n",
      " |    |    |    |-- key: string\n",
      " |    |    |    |-- value: string (valueContainsNull = true)\n",
      " |-- token: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- annotatorType: string (nullable = true)\n",
      " |    |    |-- begin: integer (nullable = false)\n",
      " |    |    |-- end: integer (nullable = false)\n",
      " |    |    |-- result: string (nullable = true)\n",
      " |    |    |-- metadata: map (nullable = true)\n",
      " |    |    |    |-- key: string\n",
      " |    |    |    |-- value: string (valueContainsNull = true)\n",
      " |-- normal: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- annotatorType: string (nullable = true)\n",
      " |    |    |-- begin: integer (nullable = false)\n",
      " |    |    |-- end: integer (nullable = false)\n",
      " |    |    |-- result: string (nullable = true)\n",
      " |    |    |-- metadata: map (nullable = true)\n",
      " |    |    |    |-- key: string\n",
      " |    |    |    |-- value: string (valueContainsNull = true)\n",
      " |-- lemma: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- annotatorType: string (nullable = true)\n",
      " |    |    |-- begin: integer (nullable = false)\n",
      " |    |    |-- end: integer (nullable = false)\n",
      " |    |    |-- result: string (nullable = true)\n",
      " |    |    |-- metadata: map (nullable = true)\n",
      " |    |    |    |-- key: string\n",
      " |    |    |    |-- value: string (valueContainsNull = true)\n",
      " |-- pos: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- annotatorType: string (nullable = true)\n",
      " |    |    |-- begin: integer (nullable = false)\n",
      " |    |    |-- end: integer (nullable = false)\n",
      " |    |    |-- result: string (nullable = true)\n",
      " |    |    |-- metadata: map (nullable = true)\n",
      " |    |    |    |-- key: string\n",
      " |    |    |    |-- value: string (valueContainsNull = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp\n",
    "dfAnnotated = bp.annotate(dfAuthorTilte, \"title\")\n",
    "dfAnnotated.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deal with Map type to query subfields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:29:11.430140Z",
     "start_time": "2018-12-04T16:29:10.973865Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>metadata</th>\n",
       "      <th>result</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Microsoft Corp said it has discovered hacking targeting democratic institutions, think tanks, and non-profit organizations in Europe.</td>\n",
       "      <td>[{'word': 'Microsoft'}, {'word': 'Corp'}, {'word': 'said'}, {'word': 'it'}, {'word': 'has'}, {'word': 'discovered'}, {'word': 'hacking'}, {'word': 'targeting'}, {'word': 'democratic'}, {'word': 'institutions'}, {'word': 'think'}, {'word': 'tanks'}, {'word': 'and'}, {'word': 'nonprofit'}, {'word': 'organizations'}, {'word': 'in'}, {'word': 'Europe'}]</td>\n",
       "      <td>[NNP, NNP, VBD, PRP, VBZ, VBN, VBG, VBG, JJ, NNS, VBP, NNS, CC, NN, NNS, IN, NNP]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Deutsche Bank reportedly planned to extend the dates of $340 million in loans to Trump Organization to avoid a potential nightmare of chasing a sitting president for cash</td>\n",
       "      <td>[{'word': 'Deutsche'}, {'word': 'Bank'}, {'word': 'reportedly'}, {'word': 'planned'}, {'word': 'to'}, {'word': 'extend'}, {'word': 'the'}, {'word': 'dates'}, {'word': 'of'}, {'word': 'million'}, {'word': 'in'}, {'word': 'loans'}, {'word': 'to'}, {'word': 'Trump'}, {'word': 'Organization'}, {'word': 'to'}, {'word': 'avoid'}, {'word': 'a'}, {'word': 'potential'}, {'word': 'nightmare'}, {'word': 'of'}, {'word': 'chasing'}, {'word': 'a'}, {'word': 'sitting'}, {'word': 'president'}, {'word': 'for'}, {'word': 'cash'}]</td>\n",
       "      <td>[NNP, NNP, RB, VBD, TO, VB, DT, NNS, IN, CD, IN, NNS, TO, NNP, NNP, TO, VB, DT, JJ, NN, IN, VBG, DT, VBG, NN, IN, NN]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Iranian \"morality police\" were forced to fire warning shots when a crowd intervened to prevent them from arresting two women for not wearing a hijab. The incident occurred in Tehran's northeastern Narmak neighbourhood on Friday night, and ended with a mob tearing the door off a police vehicle.</td>\n",
       "      <td>[{'word': 'Iranian'}, {'word': 'morality'}, {'word': 'police'}, {'word': 'were'}, {'word': 'forced'}, {'word': 'to'}, {'word': 'fire'}, {'word': 'warning'}, {'word': 'shots'}, {'word': 'when'}, {'word': 'a'}, {'word': 'crowd'}, {'word': 'intervened'}, {'word': 'to'}, {'word': 'prevent'}, {'word': 'them'}, {'word': 'from'}, {'word': 'arresting'}, {'word': 'two'}, {'word': 'women'}, {'word': 'for'}, {'word': 'not'}, {'word': 'wearing'}, {'word': 'a'}, {'word': 'hijab'}, {'word': 'The'}, {'word': 'incident'}, {'word': 'occurred'}, {'word': 'in'}, {'word': 'Tehran'}, {'word': 's'}, {'word': 'northeastern'}, {'word': 'Narmak'}, {'word': 'neighbourhood'}, {'word': 'on'}, {'word': 'Friday'}, {'word': 'night'}, {'word': 'and'}, {'word': 'ended'}, {'word': 'with'}, {'word': 'a'}, {'word': 'mob'...</td>\n",
       "      <td>[JJ, NN, NN, VBD, VBN, TO, VB, NN, NNS, WRB, DT, NN, VBD, TO, VB, PRP, IN, VBG, CD, NNS, IN, RB, VBG, DT, NN, DT, NN, VBD, IN, NNP, VBZ, JJ, NNP, NN, IN, NNP, NN, CC, VBD, IN, DT, NN, VBG, DT, NN, RP, DT, NN, NN]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Trump administration 'pushing Saudi nuclear deal' which could benefit company linked to Jared Kushner - Senior Trump administration officials pushed a project to share nuclear power technology with Saudi Arabia over the objections of ethics officials, according to a congressional report</td>\n",
       "      <td>[{'word': 'Trump'}, {'word': 'administration'}, {'word': 'pushing'}, {'word': 'Saudi'}, {'word': 'nuclear'}, {'word': 'deal'}, {'word': 'which'}, {'word': 'could'}, {'word': 'benefit'}, {'word': 'company'}, {'word': 'linked'}, {'word': 'to'}, {'word': 'Jared'}, {'word': 'Kushner'}, {'word': 'Senior'}, {'word': 'Trump'}, {'word': 'administration'}, {'word': 'officials'}, {'word': 'pushed'}, {'word': 'a'}, {'word': 'project'}, {'word': 'to'}, {'word': 'share'}, {'word': 'nuclear'}, {'word': 'power'}, {'word': 'technology'}, {'word': 'with'}, {'word': 'Saudi'}, {'word': 'Arabia'}, {'word': 'over'}, {'word': 'the'}, {'word': 'objections'}, {'word': 'of'}, {'word': 'ethics'}, {'word': 'officials'}, {'word': 'according'}, {'word': 'to'}, {'word': 'a'}, {'word': 'congressional'}, {'word': 're...</td>\n",
       "      <td>[NNP, NN, VBG, NNP, NN, NN, WDT, MD, VB, NN, VBN, TO, NNP, NNP, NNP, NNP, NN, NNS, VBD, DT, NN, TO, VB, JJ, NN, NN, IN, NNP, NNP, IN, DT, NNS, IN, NNS, NNS, VBG, TO, DT, JJ, NN]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NASA Happily Reports the Earth is Greener, With More Trees Than 20 Years Ago–and It's Thanks to China, India</td>\n",
       "      <td>[{'word': 'NASA'}, {'word': 'Happily'}, {'word': 'Reports'}, {'word': 'the'}, {'word': 'Earth'}, {'word': 'is'}, {'word': 'Greener'}, {'word': 'With'}, {'word': 'More'}, {'word': 'Trees'}, {'word': 'Than'}, {'word': 'Years'}, {'word': 'Agoand'}, {'word': 'It'}, {'word': 's'}, {'word': 'Thanks'}, {'word': 'to'}, {'word': 'China'}, {'word': 'India'}]</td>\n",
       "      <td>[NNP, NNP, NNS, DT, NNP, VBZ, NNP, IN, JJR, NNP, IN, NNS, NNP, PRP, VBZ, NNS, TO, NNP, NNP]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                                                                                                                                                     text  \\\n",
       "0                                                                                                                                                                   Microsoft Corp said it has discovered hacking targeting democratic institutions, think tanks, and non-profit organizations in Europe.   \n",
       "1                                                                                                                              Deutsche Bank reportedly planned to extend the dates of $340 million in loans to Trump Organization to avoid a potential nightmare of chasing a sitting president for cash   \n",
       "2  Iranian \"morality police\" were forced to fire warning shots when a crowd intervened to prevent them from arresting two women for not wearing a hijab. The incident occurred in Tehran's northeastern Narmak neighbourhood on Friday night, and ended with a mob tearing the door off a police vehicle.   \n",
       "3         Trump administration 'pushing Saudi nuclear deal' which could benefit company linked to Jared Kushner - Senior Trump administration officials pushed a project to share nuclear power technology with Saudi Arabia over the objections of ethics officials, according to a congressional report   \n",
       "4                                                                                                                                                                                            NASA Happily Reports the Earth is Greener, With More Trees Than 20 Years Ago–and It's Thanks to China, India   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          metadata  \\\n",
       "0                                                                                                                                                                                                                                                                                                                                                                                                                                                                  [{'word': 'Microsoft'}, {'word': 'Corp'}, {'word': 'said'}, {'word': 'it'}, {'word': 'has'}, {'word': 'discovered'}, {'word': 'hacking'}, {'word': 'targeting'}, {'word': 'democratic'}, {'word': 'institutions'}, {'word': 'think'}, {'word': 'tanks'}, {'word': 'and'}, {'word': 'nonprofit'}, {'word': 'organizations'}, {'word': 'in'}, {'word': 'Europe'}]   \n",
       "1                                                                                                                                                                                                                                                                                            [{'word': 'Deutsche'}, {'word': 'Bank'}, {'word': 'reportedly'}, {'word': 'planned'}, {'word': 'to'}, {'word': 'extend'}, {'word': 'the'}, {'word': 'dates'}, {'word': 'of'}, {'word': 'million'}, {'word': 'in'}, {'word': 'loans'}, {'word': 'to'}, {'word': 'Trump'}, {'word': 'Organization'}, {'word': 'to'}, {'word': 'avoid'}, {'word': 'a'}, {'word': 'potential'}, {'word': 'nightmare'}, {'word': 'of'}, {'word': 'chasing'}, {'word': 'a'}, {'word': 'sitting'}, {'word': 'president'}, {'word': 'for'}, {'word': 'cash'}]   \n",
       "2  [{'word': 'Iranian'}, {'word': 'morality'}, {'word': 'police'}, {'word': 'were'}, {'word': 'forced'}, {'word': 'to'}, {'word': 'fire'}, {'word': 'warning'}, {'word': 'shots'}, {'word': 'when'}, {'word': 'a'}, {'word': 'crowd'}, {'word': 'intervened'}, {'word': 'to'}, {'word': 'prevent'}, {'word': 'them'}, {'word': 'from'}, {'word': 'arresting'}, {'word': 'two'}, {'word': 'women'}, {'word': 'for'}, {'word': 'not'}, {'word': 'wearing'}, {'word': 'a'}, {'word': 'hijab'}, {'word': 'The'}, {'word': 'incident'}, {'word': 'occurred'}, {'word': 'in'}, {'word': 'Tehran'}, {'word': 's'}, {'word': 'northeastern'}, {'word': 'Narmak'}, {'word': 'neighbourhood'}, {'word': 'on'}, {'word': 'Friday'}, {'word': 'night'}, {'word': 'and'}, {'word': 'ended'}, {'word': 'with'}, {'word': 'a'}, {'word': 'mob'...   \n",
       "3  [{'word': 'Trump'}, {'word': 'administration'}, {'word': 'pushing'}, {'word': 'Saudi'}, {'word': 'nuclear'}, {'word': 'deal'}, {'word': 'which'}, {'word': 'could'}, {'word': 'benefit'}, {'word': 'company'}, {'word': 'linked'}, {'word': 'to'}, {'word': 'Jared'}, {'word': 'Kushner'}, {'word': 'Senior'}, {'word': 'Trump'}, {'word': 'administration'}, {'word': 'officials'}, {'word': 'pushed'}, {'word': 'a'}, {'word': 'project'}, {'word': 'to'}, {'word': 'share'}, {'word': 'nuclear'}, {'word': 'power'}, {'word': 'technology'}, {'word': 'with'}, {'word': 'Saudi'}, {'word': 'Arabia'}, {'word': 'over'}, {'word': 'the'}, {'word': 'objections'}, {'word': 'of'}, {'word': 'ethics'}, {'word': 'officials'}, {'word': 'according'}, {'word': 'to'}, {'word': 'a'}, {'word': 'congressional'}, {'word': 're...   \n",
       "4                                                                                                                                                                                                                                                                                                                                                                                                                                                                   [{'word': 'NASA'}, {'word': 'Happily'}, {'word': 'Reports'}, {'word': 'the'}, {'word': 'Earth'}, {'word': 'is'}, {'word': 'Greener'}, {'word': 'With'}, {'word': 'More'}, {'word': 'Trees'}, {'word': 'Than'}, {'word': 'Years'}, {'word': 'Agoand'}, {'word': 'It'}, {'word': 's'}, {'word': 'Thanks'}, {'word': 'to'}, {'word': 'China'}, {'word': 'India'}]   \n",
       "\n",
       "                                                                                                                                                                                                                 result  \n",
       "0                                                                                                                                     [NNP, NNP, VBD, PRP, VBZ, VBN, VBG, VBG, JJ, NNS, VBP, NNS, CC, NN, NNS, IN, NNP]  \n",
       "1                                                                                                 [NNP, NNP, RB, VBD, TO, VB, DT, NNS, IN, CD, IN, NNS, TO, NNP, NNP, TO, VB, DT, JJ, NN, IN, VBG, DT, VBG, NN, IN, NN]  \n",
       "2  [JJ, NN, NN, VBD, VBN, TO, VB, NN, NNS, WRB, DT, NN, VBD, TO, VB, PRP, IN, VBG, CD, NNS, IN, RB, VBG, DT, NN, DT, NN, VBD, IN, NNP, VBZ, JJ, NNP, NN, IN, NNP, NN, CC, VBD, IN, DT, NN, VBG, DT, NN, RP, DT, NN, NN]  \n",
       "3                                     [NNP, NN, VBG, NNP, NN, NN, WDT, MD, VB, NN, VBN, TO, NNP, NNP, NNP, NNP, NN, NNS, VBD, DT, NN, TO, VB, JJ, NN, NN, IN, NNP, NNP, IN, DT, NNS, IN, NNS, NNS, VBG, TO, DT, JJ, NN]  \n",
       "4                                                                                                                           [NNP, NNP, NNS, DT, NNP, VBZ, NNP, IN, JJR, NNP, IN, NNS, NNP, PRP, VBZ, NNS, TO, NNP, NNP]  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfPos = dfAnnotated.select(\"text\", \"pos.metadata\", \"pos.result\")\n",
    "dfPos.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:29:12.012202Z",
     "start_time": "2018-12-04T16:29:11.432322Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- pos: struct (nullable = true)\n",
      " |    |-- annotatorType: string (nullable = true)\n",
      " |    |-- begin: integer (nullable = false)\n",
      " |    |-- end: integer (nullable = false)\n",
      " |    |-- result: string (nullable = true)\n",
      " |    |-- metadata: map (nullable = true)\n",
      " |    |    |-- key: string\n",
      " |    |    |-- value: string (valueContainsNull = true)\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pos</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(pos, 0, 8, NNP, {'word': 'Microsoft'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(pos, 10, 13, NNP, {'word': 'Corp'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(pos, 15, 18, VBD, {'word': 'said'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(pos, 20, 21, PRP, {'word': 'it'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(pos, 23, 25, VBZ, {'word': 'has'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(pos, 27, 36, VBN, {'word': 'discovered'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>(pos, 38, 44, VBG, {'word': 'hacking'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>(pos, 46, 54, VBG, {'word': 'targeting'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>(pos, 56, 65, JJ, {'word': 'democratic'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>(pos, 67, 78, NNS, {'word': 'institutions'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>(pos, 81, 85, VBP, {'word': 'think'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>(pos, 87, 91, NNS, {'word': 'tanks'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>(pos, 94, 96, CC, {'word': 'and'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>(pos, 98, 107, NN, {'word': 'nonprofit'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>(pos, 109, 121, NNS, {'word': 'organizations'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>(pos, 123, 124, IN, {'word': 'in'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>(pos, 126, 131, NNP, {'word': 'Europe'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>(pos, 0, 7, NNP, {'word': 'Deutsche'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>(pos, 9, 12, NNP, {'word': 'Bank'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>(pos, 14, 23, RB, {'word': 'reportedly'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>(pos, 25, 31, VBD, {'word': 'planned'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>(pos, 33, 34, TO, {'word': 'to'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>(pos, 36, 41, VB, {'word': 'extend'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>(pos, 43, 45, DT, {'word': 'the'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>(pos, 47, 51, NNS, {'word': 'dates'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>(pos, 53, 54, IN, {'word': 'of'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>(pos, 61, 67, CD, {'word': 'million'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>(pos, 69, 70, IN, {'word': 'in'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>(pos, 72, 76, NNS, {'word': 'loans'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>(pos, 78, 79, TO, {'word': 'to'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1599</th>\n",
       "      <td>(pos, 34, 35, IN, {'word': 'in'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1600</th>\n",
       "      <td>(pos, 37, 44, JJ, {'word': 'northern'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1601</th>\n",
       "      <td>(pos, 46, 49, NN, {'word': 'city'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1602</th>\n",
       "      <td>(pos, 0, 7, NNP, {'word': 'Adelaide'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1603</th>\n",
       "      <td>(pos, 9, 13, NN, {'word': 'pilot'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1604</th>\n",
       "      <td>(pos, 15, 20, VBZ, {'word': 'leaves'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1605</th>\n",
       "      <td>(pos, 23, 23, PRP, {'word': 'I'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1606</th>\n",
       "      <td>(pos, 24, 25, NN, {'word': 'm'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1607</th>\n",
       "      <td>(pos, 27, 31, JJ, {'word': 'bored'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1608</th>\n",
       "      <td>(pos, 34, 40, NN, {'word': 'message'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1609</th>\n",
       "      <td>(pos, 42, 44, CC, {'word': 'and'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1610</th>\n",
       "      <td>(pos, 46, 53, NN, {'word': 'graffiti'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1611</th>\n",
       "      <td>(pos, 55, 56, IN, {'word': 'on'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1612</th>\n",
       "      <td>(pos, 58, 63, NN, {'word': 'flight'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1613</th>\n",
       "      <td>(pos, 65, 69, NN, {'word': 'radar'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1614</th>\n",
       "      <td>(pos, 0, 8, NNS, {'word': 'Thousands'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1615</th>\n",
       "      <td>(pos, 10, 11, IN, {'word': 'of'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1616</th>\n",
       "      <td>(pos, 13, 18, JJ, {'word': 'French'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1617</th>\n",
       "      <td>(pos, 20, 26, NN, {'word': 'protest'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1618</th>\n",
       "      <td>(pos, 28, 32, IN, {'word': 'after'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1619</th>\n",
       "      <td>(pos, 34, 40, NN, {'word': 'country'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1620</th>\n",
       "      <td>(pos, 42, 48, VBD, {'word': 'gripped'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1621</th>\n",
       "      <td>(pos, 50, 51, IN, {'word': 'by'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1622</th>\n",
       "      <td>(pos, 53, 53, DT, {'word': 'a'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1623</th>\n",
       "      <td>(pos, 55, 58, NN, {'word': 'wave'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1624</th>\n",
       "      <td>(pos, 60, 61, IN, {'word': 'of'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1625</th>\n",
       "      <td>(pos, 63, 74, JJ, {'word': 'unapologetic'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1626</th>\n",
       "      <td>(pos, 76, 87, JJ, {'word': 'antisemitic'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1627</th>\n",
       "      <td>(pos, 89, 95, NNS, {'word': 'attacks'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1628</th>\n",
       "      <td>(pos, 98, 106, NN, {'word': 'vandalism'})</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1629 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  pos\n",
       "0             (pos, 0, 8, NNP, {'word': 'Microsoft'})\n",
       "1                (pos, 10, 13, NNP, {'word': 'Corp'})\n",
       "2                (pos, 15, 18, VBD, {'word': 'said'})\n",
       "3                  (pos, 20, 21, PRP, {'word': 'it'})\n",
       "4                 (pos, 23, 25, VBZ, {'word': 'has'})\n",
       "5          (pos, 27, 36, VBN, {'word': 'discovered'})\n",
       "6             (pos, 38, 44, VBG, {'word': 'hacking'})\n",
       "7           (pos, 46, 54, VBG, {'word': 'targeting'})\n",
       "8           (pos, 56, 65, JJ, {'word': 'democratic'})\n",
       "9        (pos, 67, 78, NNS, {'word': 'institutions'})\n",
       "10              (pos, 81, 85, VBP, {'word': 'think'})\n",
       "11              (pos, 87, 91, NNS, {'word': 'tanks'})\n",
       "12                 (pos, 94, 96, CC, {'word': 'and'})\n",
       "13          (pos, 98, 107, NN, {'word': 'nonprofit'})\n",
       "14    (pos, 109, 121, NNS, {'word': 'organizations'})\n",
       "15                (pos, 123, 124, IN, {'word': 'in'})\n",
       "16           (pos, 126, 131, NNP, {'word': 'Europe'})\n",
       "17             (pos, 0, 7, NNP, {'word': 'Deutsche'})\n",
       "18                (pos, 9, 12, NNP, {'word': 'Bank'})\n",
       "19          (pos, 14, 23, RB, {'word': 'reportedly'})\n",
       "20            (pos, 25, 31, VBD, {'word': 'planned'})\n",
       "21                  (pos, 33, 34, TO, {'word': 'to'})\n",
       "22              (pos, 36, 41, VB, {'word': 'extend'})\n",
       "23                 (pos, 43, 45, DT, {'word': 'the'})\n",
       "24              (pos, 47, 51, NNS, {'word': 'dates'})\n",
       "25                  (pos, 53, 54, IN, {'word': 'of'})\n",
       "26             (pos, 61, 67, CD, {'word': 'million'})\n",
       "27                  (pos, 69, 70, IN, {'word': 'in'})\n",
       "28              (pos, 72, 76, NNS, {'word': 'loans'})\n",
       "29                  (pos, 78, 79, TO, {'word': 'to'})\n",
       "...                                               ...\n",
       "1599                (pos, 34, 35, IN, {'word': 'in'})\n",
       "1600          (pos, 37, 44, JJ, {'word': 'northern'})\n",
       "1601              (pos, 46, 49, NN, {'word': 'city'})\n",
       "1602           (pos, 0, 7, NNP, {'word': 'Adelaide'})\n",
       "1603              (pos, 9, 13, NN, {'word': 'pilot'})\n",
       "1604           (pos, 15, 20, VBZ, {'word': 'leaves'})\n",
       "1605                (pos, 23, 23, PRP, {'word': 'I'})\n",
       "1606                 (pos, 24, 25, NN, {'word': 'm'})\n",
       "1607             (pos, 27, 31, JJ, {'word': 'bored'})\n",
       "1608           (pos, 34, 40, NN, {'word': 'message'})\n",
       "1609               (pos, 42, 44, CC, {'word': 'and'})\n",
       "1610          (pos, 46, 53, NN, {'word': 'graffiti'})\n",
       "1611                (pos, 55, 56, IN, {'word': 'on'})\n",
       "1612            (pos, 58, 63, NN, {'word': 'flight'})\n",
       "1613             (pos, 65, 69, NN, {'word': 'radar'})\n",
       "1614          (pos, 0, 8, NNS, {'word': 'Thousands'})\n",
       "1615                (pos, 10, 11, IN, {'word': 'of'})\n",
       "1616            (pos, 13, 18, JJ, {'word': 'French'})\n",
       "1617           (pos, 20, 26, NN, {'word': 'protest'})\n",
       "1618             (pos, 28, 32, IN, {'word': 'after'})\n",
       "1619           (pos, 34, 40, NN, {'word': 'country'})\n",
       "1620          (pos, 42, 48, VBD, {'word': 'gripped'})\n",
       "1621                (pos, 50, 51, IN, {'word': 'by'})\n",
       "1622                 (pos, 53, 53, DT, {'word': 'a'})\n",
       "1623              (pos, 55, 58, NN, {'word': 'wave'})\n",
       "1624                (pos, 60, 61, IN, {'word': 'of'})\n",
       "1625      (pos, 63, 74, JJ, {'word': 'unapologetic'})\n",
       "1626       (pos, 76, 87, JJ, {'word': 'antisemitic'})\n",
       "1627          (pos, 89, 95, NNS, {'word': 'attacks'})\n",
       "1628        (pos, 98, 106, NN, {'word': 'vandalism'})\n",
       "\n",
       "[1629 rows x 1 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfPos= dfAnnotated.select(F.explode(\"pos\").alias(\"pos\"))\n",
    "dfPos.printSchema()\n",
    "dfPos.toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Keep only proper nouns NNP or NNPS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:29:12.551881Z",
     "start_time": "2018-12-04T16:29:12.014196Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pos</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(pos, 0, 8, NNP, {'word': 'Microsoft'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(pos, 10, 13, NNP, {'word': 'Corp'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(pos, 126, 131, NNP, {'word': 'Europe'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(pos, 0, 7, NNP, {'word': 'Deutsche'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(pos, 9, 12, NNP, {'word': 'Bank'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>(pos, 81, 85, NNP, {'word': 'Trump'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>(pos, 87, 98, NNP, {'word': 'Organization'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>(pos, 175, 180, NNP, {'word': 'Tehran'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>(pos, 197, 202, NNP, {'word': 'Narmak'})</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>(pos, 221, 226, NNP, {'word': 'Friday'})</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            pos\n",
       "0       (pos, 0, 8, NNP, {'word': 'Microsoft'})\n",
       "1          (pos, 10, 13, NNP, {'word': 'Corp'})\n",
       "2      (pos, 126, 131, NNP, {'word': 'Europe'})\n",
       "3        (pos, 0, 7, NNP, {'word': 'Deutsche'})\n",
       "4           (pos, 9, 12, NNP, {'word': 'Bank'})\n",
       "5         (pos, 81, 85, NNP, {'word': 'Trump'})\n",
       "6  (pos, 87, 98, NNP, {'word': 'Organization'})\n",
       "7      (pos, 175, 180, NNP, {'word': 'Tehran'})\n",
       "8      (pos, 197, 202, NNP, {'word': 'Narmak'})\n",
       "9      (pos, 221, 226, NNP, {'word': 'Friday'})"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nnpFilter = \"pos.result = 'NNP' or pos.result = 'NNPS' \"\n",
    "dfNNP = dfPos.where(nnpFilter)\n",
    "dfNNP.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract columns form a map in a col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-04T16:29:12.811100Z",
     "start_time": "2018-12-04T16:29:12.556429Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Microsoft</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Corp</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Europe</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Deutsche</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Bank</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Trump</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Organization</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Tehran</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Narmak</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Friday</td>\n",
       "      <td>NNP</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           word  tag\n",
       "0     Microsoft  NNP\n",
       "1          Corp  NNP\n",
       "2        Europe  NNP\n",
       "3      Deutsche  NNP\n",
       "4          Bank  NNP\n",
       "5         Trump  NNP\n",
       "6  Organization  NNP\n",
       "7        Tehran  NNP\n",
       "8        Narmak  NNP\n",
       "9        Friday  NNP"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfWordTag = dfNNP.selectExpr(\"pos.metadata['word'] as word\", \"pos.result as tag\")\n",
    "dfWordTag.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+\n",
      "|    word|count|\n",
      "+--------+-----+\n",
      "|      US|   14|\n",
      "|   Trump|    9|\n",
      "|   Saudi|    8|\n",
      "|   Putin|    7|\n",
      "|  Russia|    6|\n",
      "|  Europe|    5|\n",
      "|  Arabia|    5|\n",
      "|Catholic|    4|\n",
      "|      UK|    4|\n",
      "|Vladimir|    4|\n",
      "| Germany|    3|\n",
      "|   China|    3|\n",
      "|    Pope|    3|\n",
      "|   Egypt|    3|\n",
      "|   South|    3|\n",
      "|   House|    3|\n",
      "|  Church|    3|\n",
      "|  Africa|    2|\n",
      "|National|    2|\n",
      "|  Health|    2|\n",
      "+--------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.functions import desc\n",
    "dfWordTag.groupBy(\"word\").count().orderBy(desc(\"count\")).show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {
    "height": "84px",
    "width": "160px"
   },
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "236px"
   },
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
